!pip install pymongo lifelines pandas
!pip install maxminddb-geolite2 folium
import pandas as pd
from pymongo import MongoClient
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from geolite2 import geolite2
import folium
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import plotly.graph_objs as go
from scipy import stats
from sklearn.ensemble import IsolationForest
from scipy.stats import ttest_ind, mannwhitneyu
#connecting the database
client = MongoClient('mongodb://localhost:27017/')
# select the database and collection
db = client['major']
collection = db['data']
data = collection.find()
nodes=[]
messages=[]
for d in data:
d.pop('_id') # remove the '_id' field from each document
if 'port' in d:
nodes.append(d)
else:
messages.append(d)
df = pd.DataFrame(messages)
df = pd.concat([df.drop(['payload'], axis=1), df['payload'].apply(pd.Series)], axis=1)
df
| sender | receiver | date | temperature | wind | speed | |
|---|---|---|---|---|---|---|
| 0 | 832a1898-0575-4d8d-b961-bae784b6dd0d | 68703395-5a6a-4620-aecb-88a47253a781 | 2022-12-28 10:07:31 | 24 | 9 | 157 |
| 1 | b1bfb6ed-b2e7-46b3-ac2e-2ee5e14c0799 | 0ae2f8c1-ac4f-417b-a9ef-80173aea3a79 | 2022-10-28 02:35:52 | 23 | 1 | 122 |
| 2 | 2ca1f8d3-b7bc-4e71-91b7-98c0ed150783 | e70fb74d-5beb-485a-a37e-0658af2a3cbe | 2022-12-18 10:38:19 | 27 | 9 | 179 |
| 3 | 8eea24ec-6941-4b3f-8337-7999f5530602 | 0721080a-0ddf-46c1-a6bf-b5cc6df41861 | 2023-01-20 02:03:36 | 29 | 7 | 195 |
| 4 | 42b5e43c-dbf0-4696-bfb8-15a4efbba101 | 028a248c-52c2-4c7c-a387-aaf5f658331b | 2022-10-21 15:11:48 | 23 | 1 | 185 |
| ... | ... | ... | ... | ... | ... | ... |
| 595 | a9aba850-a62a-4434-a628-d2c8c4c3fdd6 | 8a712d8b-5453-47ba-8d64-abf1d11d1c1b | 2023-01-15 22:02:32 | 27 | 14 | 105 |
| 596 | 0c81bc4a-db3b-412f-8514-afd31a96913d | d91286d0-b637-472d-947e-f3940fde6844 | 2023-01-23 04:48:57 | 27 | 1 | 186 |
| 597 | 62d65c20-33ec-426a-b089-9fb4d7d6a051 | 158130d9-d724-4987-bc7a-b14c26b97252 | 2023-01-01 22:05:55 | 26 | 0 | 135 |
| 598 | 96096f13-34c6-48d8-a556-30fb2c851220 | 6754c66b-1fd7-4657-a4ca-f22575921f82 | 2022-11-21 13:37:12 | 22 | 16 | 194 |
| 599 | c5f82214-7c9f-49a3-9da9-eb2c42e98e29 | d56f3755-83d3-4d6c-a14b-d70dfe424a27 | 2022-12-27 01:34:06 | 26 | 16 | 198 |
600 rows × 6 columns
df1 = pd.DataFrame(nodes)
df1
| id | port | addr | peers | honest | |
|---|---|---|---|---|---|
| 0 | b1bfb6ed-b2e7-46b3-ac2e-2ee5e14c0799 | 29464 | 181.116.72.251 | [131.46.126.190, 137.218.170.21, 166.165.61.15... | true |
| 1 | 84d73484-be89-4dba-9a2d-5906218690e6 | 36666 | 28.46.162.194 | [105.122.9.123, 51.49.234.131, 38.152.179.69, ... | true |
| 2 | 7096e26b-e4ad-4372-b8c1-ad678af802cf | 27253 | 137.218.170.21 | [23.104.239.48, 176.127.36.206, 51.49.234.131,... | true |
| 3 | 0998bce7-f2a6-4209-8f27-372d5dad026c | 35977 | 49.171.135.105 | [227.232.138.74, 176.109.174.69, 87.101.29.237... | true |
| 4 | 8ccff98e-b2d0-4eef-9e6f-25b9cf12e5fe | 25112 | 184.193.5.40 | [176.127.36.206, 8.178.157.69, 233.167.126.226... | true |
| ... | ... | ... | ... | ... | ... |
| 595 | f65090be-293c-461c-89d0-4aeed84cecf1 | 29832 | 5.23.123.237 | [214.194.1.209, 234.82.244.37, 211.7.121.116, ... | true |
| 596 | f24d7eef-9f50-4d69-bc27-683c4cc69d15 | 21501 | 84.107.249.36 | [85.252.182.39, 186.34.67.36, 3.37.19.199, 121... | true |
| 597 | 66a38013-24fd-4806-8205-bd0a5bb96093 | 33913 | 189.71.8.217 | [180.231.161.196, 242.76.202.173, 5.23.123.237... | true |
| 598 | bf0b0a77-79e5-4e16-97a5-9bdc4f14fd44 | 25133 | 202.15.10.205 | [59.56.4.168, 22.133.131.87, 12.85.125.43, 147... | true |
| 599 | 7af3d73c-5318-438f-bae4-89ecfb0a7e0c | 31701 | 196.63.212.40 | [32.199.106.14, 124.108.203.173, 68.205.186.33... | true |
600 rows × 5 columns
merged_df = pd.merge(df, df1, left_on='sender', right_on='id')
merged_df=merged_df.drop(['id'],axis=1)
merged_df
| sender | receiver | date | temperature | wind | speed | port | addr | peers | honest | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 832a1898-0575-4d8d-b961-bae784b6dd0d | 68703395-5a6a-4620-aecb-88a47253a781 | 2022-12-28 10:07:31 | 24 | 9 | 157 | 32692 | 66.13.28.163 | [197.143.62.237, 233.167.126.226, 54.199.60.13... | true |
| 1 | 832a1898-0575-4d8d-b961-bae784b6dd0d | ed59c1df-f324-4cf2-9f97-4a75a1a1cfd0 | 2022-11-17 15:24:22 | 30 | 1 | 120 | 32692 | 66.13.28.163 | [197.143.62.237, 233.167.126.226, 54.199.60.13... | true |
| 2 | 832a1898-0575-4d8d-b961-bae784b6dd0d | 97d71e1b-7861-4d75-82ac-f651c5268ce4 | 2022-10-11 00:00:51 | 25 | 7 | 183 | 32692 | 66.13.28.163 | [197.143.62.237, 233.167.126.226, 54.199.60.13... | true |
| 3 | 832a1898-0575-4d8d-b961-bae784b6dd0d | 2fd37557-de7d-4607-876f-45703e7020dd | 2022-12-24 16:52:39 | 27 | 14 | 188 | 32692 | 66.13.28.163 | [197.143.62.237, 233.167.126.226, 54.199.60.13... | true |
| 4 | 832a1898-0575-4d8d-b961-bae784b6dd0d | a26cab79-a60f-4675-aadb-cdcb3c0c3d3c | 2022-12-27 06:25:43 | 28 | 14 | 162 | 32692 | 66.13.28.163 | [197.143.62.237, 233.167.126.226, 54.199.60.13... | true |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 595 | b770885b-7b27-40f5-96a9-2e489632ee7f | 158130d9-d724-4987-bc7a-b14c26b97252 | 2022-09-13 23:34:52 | 28 | 0 | 161 | 35785 | 22.133.131.87 | [217.154.191.84, 150.24.66.160, 87.196.152.101... | true |
| 596 | 43e77632-c742-497f-a4c2-293e77d8c813 | 67835d5b-8330-4001-b360-92a822f8164c | 2023-01-26 02:17:14 | 29 | 5 | 139 | 32690 | 249.138.143.121 | [153.97.177.85, 235.226.18.8, 214.10.134.179, ... | true |
| 597 | f45b8b9e-e46e-442d-b12b-c8d2827e0df6 | 510dfdf7-e0a3-4d8c-9e13-a2f849f00f12 | 2023-01-04 06:59:20 | 27 | 0 | 168 | 25344 | 214.10.134.179 | [20.46.174.135, 87.196.152.101, 64.229.137.249... | true |
| 598 | a9aba850-a62a-4434-a628-d2c8c4c3fdd6 | 8a712d8b-5453-47ba-8d64-abf1d11d1c1b | 2023-01-15 22:02:32 | 27 | 14 | 105 | 20459 | 28.97.2.20 | [7.91.194.236, 201.116.9.178, 225.140.65.179, ... | true |
| 599 | 96096f13-34c6-48d8-a556-30fb2c851220 | 6754c66b-1fd7-4657-a4ca-f22575921f82 | 2022-11-21 13:37:12 | 22 | 16 | 194 | 31264 | 129.28.19.33 | [201.116.9.178, 215.91.254.245, 44.143.239.75,... | true |
600 rows × 10 columns
Correlation refers to the strength of a relationship between two variables, and is typically measured by a correlation coefficient. In the context of a DataFrame, correlation can be used to determine whether there is a linear relationship between two columns of data.
The correlation coefficient, which is a value between -1 and 1, indicates the strength and direction of the relationship between two variables. A correlation coefficient of 1 indicates a perfect positive relationship, where one variable increases as the other variable increases. A correlation coefficient of -1 indicates a perfect negative relationship, where one variable decreases as the other variable increases. A correlation coefficient of 0 indicates no relationship between the two variables.
def correlation(column1,column2):
correlation = merged_df[column1].corr(merged_df[column2])
return(f"Correlation between {column1} and {column2}:", correlation)
correlation('temperature','wind')
('Correlation between temperature and wind:', 0.09709881763028405)
correlation('wind','speed')
('Correlation between wind and speed:', 0.08819521317235023)
correlation('temperature','speed')
('Correlation between temperature and speed:', 0.050831862956256545)
frequency_table = merged_df.groupby(['sender', 'receiver']).size().reset_index(name='count')
print("Frequency table of sender-receiver pairs:")
frequency_table
Frequency table of sender-receiver pairs:
| sender | receiver | count | |
|---|---|---|---|
| 0 | 0385da65-9ab0-46ac-a6d2-462041bf1fe1 | 08f8f0e5-8f6e-4893-be5d-23276887a98e | 1 |
| 1 | 0385da65-9ab0-46ac-a6d2-462041bf1fe1 | fd59b68b-1b09-4b0b-b167-ac7d9d11d6b8 | 1 |
| 2 | 040a8064-3026-4249-82b8-75c8a1aae388 | aad31313-444b-4477-9df1-7806b0ae31af | 1 |
| 3 | 06d4ce0d-44bc-430d-807f-c79765424e6b | 847a2241-ea9c-4755-ae12-6e1e56937fc3 | 1 |
| 4 | 0721080a-0ddf-46c1-a6bf-b5cc6df41861 | 42b5e43c-dbf0-4696-bfb8-15a4efbba101 | 1 |
| ... | ... | ... | ... |
| 594 | fb2839d3-a266-46e6-a048-823b6706b1f6 | 46b360be-fc07-47fa-a9dc-fdc3bfab313e | 1 |
| 595 | fb2839d3-a266-46e6-a048-823b6706b1f6 | 65515120-efbd-42c7-a78c-d106e2d50818 | 1 |
| 596 | fde2d368-f04d-4072-b04e-2980b0583999 | 1f622d8e-f2d2-46a4-a1ad-87eb2cbd2aa0 | 1 |
| 597 | fde2d368-f04d-4072-b04e-2980b0583999 | e90c46d7-deb8-4061-82c2-634bd21a0f8b | 1 |
| 598 | fee60cd9-ff19-4f84-a156-03d99a91006e | 9c6b6086-7846-4693-84ae-0888fda2892a | 1 |
599 rows × 3 columns
# Convert the date column to datetime format
merged_df['date'] = pd.to_datetime(merged_df['date'])
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 8))
merged_df.plot(x='date', y='temperature', marker='o', ax=axes[0])
merged_df.plot(x='date', y='wind', marker='o', ax=axes[1])
merged_df.plot(x='date', y='speed', marker='o', ax=axes[2])
# Set the title and labels for the plots
axes[0].set_title('Temperature over Time')
axes[1].set_title('Wind over Time')
axes[2].set_title('Speed over Time')
fig.suptitle('Trends in Temperature, Wind, and Speed over Time')
plt.xlabel('Date')
plt.tight_layout()
plt.show()
# Extract month and year information from the date
merged_df['month'] = merged_df['date'].dt.month
merged_df['year'] = merged_df['date'].dt.year
# Create boxplots for temperature and wind by month
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.boxplot(x='month', y='temperature', data=merged_df, ax=axes[0])
axes[0].set_title('Seasonal Patterns in Temperature')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Temperature')
sns.boxplot(x='month', y='wind', data=merged_df, ax=axes[1])
axes[1].set_title('Seasonal Patterns in Wind')
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Wind')
plt.show()
# Create a directed graph using NetworkX
G = nx.DiGraph()
# Add edges to the graph for each sender-receiver pair
for sender, receiver in zip(merged_df['sender'], merged_df['receiver']):
G.add_edge(sender, receiver)
# Calculate the most frequently used sender and receiver addresses
most_frequent_senders = merged_df['sender'].value_counts().head(5)
most_frequent_receivers = merged_df['receiver'].value_counts().head(5)
# Create a subgraph of the most frequently used sender and receiver addresses
subgraph_nodes = set(most_frequent_senders.index) | set(most_frequent_receivers.index)
subgraph = G.subgraph(subgraph_nodes)
# Set node size based on degree centrality
node_sizes = [10000 * subgraph.degree(node) for node in subgraph.nodes()]
# Set node colors based on degree centrality
node_colors = [subgraph.degree(node) for node in subgraph.nodes()]
# Draw the network graph using Matplotlib
pos = nx.spring_layout(subgraph, k=5)
nx.draw_networkx_nodes(subgraph, pos, node_color=node_colors, node_size=node_sizes)
nx.draw_networkx_edges(subgraph, pos)
nx.draw_networkx_labels(subgraph, pos)
plt.axis('off')
plt.show()
# Create a directed graph using NetworkX
df_half = merged_df.sample(frac=0.05)
G = nx.DiGraph()
# Add edges to the graph for each sender-receiver pair
for sender, receiver, date, port in zip(df_half['sender'], df_half['receiver'], df_half['date'], df_half['port']):
G.add_edge(sender, receiver, date=date, port=port)
# Add edges to the graph for nodes that communicated on the same date
for date in df_half['date'].unique():
nodes_on_date = set(df_half[df_half['date'] == date]['sender']).union(set(df_half[df_half['date'] == date]['receiver']))
for node1 in nodes_on_date:
for node2 in nodes_on_date:
if node1 != node2 and not G.has_edge(node1, node2):
G.add_edge(node1, node2, date=date)
# Add edges to the graph for nodes that communicated on the same port
for port in df_half['port'].unique():
nodes_on_port = set(df_half[df_half['port'] == port]['sender']).union(set(df_half[df_half['port'] == port]['receiver']))
for node1 in nodes_on_port:
for node2 in nodes_on_port:
if node1 != node2 and not G.has_edge(node1, node2):
G.add_edge(node1, node2, port=port)
# Draw the network graph using Matplotlib
pos = nx.spring_layout(G, k=0.5)
nx.draw_networkx(G, pos, with_labels=True, node_size=500, font_size=8)
plt.axis('off')
plt.show()
port_counts = merged_df['port'].value_counts()
# Create a bar chart of the distribution of port numbers using Plotly Express
fig = px.bar(y=port_counts.index, x=port_counts.values, labels={'y': 'Port', 'x': 'Frequency'},
title='Distribution of Port Numbers',template='plotly_white')
# Show the plot
fig.show()
Preprocessing the data: a. Converting the date column to datetime format. b. Extracting relevant features from the date, such as day of the week and month. c. Encoding the categorical feature (sender) using label encoding.
Defining the features (X) and target (y) for the prediction task. In this example, the target variable is 'temperature', and the features include the day of the week, month, wind, speed, and the encoded sender information.
Splitting the dataset into a training set and a testing set. The training set is used to train the machine learning model, while the testing set is used to evaluate its performance.
Training a RandomForestRegressor model on the training set. The RandomForestRegressor is an ensemble learning method for regression tasks that operate by constructing multiple decision trees during training and outputting the mean prediction of the individual trees.
Predicting temperature values for the test set using the trained RandomForestRegressor model.
Calculating the mean absolute error (MAE) of the predictions. The MAE is a measure of the average difference between the true temperature values and the predicted values in the test set. It gives an idea of how well the model performs on unseen data.
# Convert the date column to datetime format and extract relevant features
merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df['day_of_week'] = merged_df['date'].dt.dayofweek
merged_df['month'] = merged_df['date'].dt.month
# Encode the categorical feature (sender)
encoder = LabelEncoder()
merged_df['sender_encoded'] = encoder.fit_transform(merged_df['sender'])
# Define the features (X) and target (y)
X = merged_df[['day_of_week', 'month', 'wind', 'speed', 'sender_encoded']]
y = merged_df['temperature']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predict temperature using the test set
y_pred = model.predict(X_test)
# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
Mean Absolute Error: 2.7030000000000003
# Define the features (X) and target (y)
X = merged_df[['day_of_week', 'month', 'temperature', 'wind', 'speed', 'sender_encoded']]
y = merged_df['honest']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Predict the honest attribute using the test set
y_pred = model.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.9083333333333333
the 'addr' feature represents geographical IP addresses, you can convert them into coordinates (latitude and longitude) and analyze the spatial distribution of data points.
# Initialize the GeoLite2 reader
reader = geolite2.reader()
# Function to get coordinates from IP addresses
def get_coordinates(ip):
location_data = reader.get(ip)
if location_data and 'location' in location_data:
return location_data['location']['latitude'], location_data['location']['longitude']
return None, None
# Get coordinates for each IP address
merged_df['latitude'], merged_df['longitude'] = zip(*merged_df['addr'].apply(get_coordinates))
# Close the GeoLite2 reader
geolite2.close()
# Create a map centered at the mean latitude and longitude values
map_center = [merged_df['latitude'].mean(), merged_df['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=2)
# Add markers for each IP address
for index, row in merged_df.iterrows():
if pd.notna(row['latitude']) and pd.notna(row['longitude']):
folium.Marker([row['latitude'], row['longitude']], tooltip=row['addr']).add_to(m)
# Display the map
m
# Create a Scatter Mapbox plot
fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', hover_name='addr',
mapbox_style='carto-positron', zoom=2)
# Show the plot
fig.show()
# Define the number of clusters (you can determine the optimal number using methods like the Elbow method)
n_clusters = 3
merged_df.dropna(inplace=True)
# Create a KMeans model
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# Fit the model using latitude and longitude
coordinates = merged_df[['latitude', 'longitude']]
kmeans.fit(coordinates)
# Get cluster labels
merged_df['cluster'] = kmeans.labels_
# Plot the clustered data points
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', color='cluster',
hover_name='addr', mapbox_style='carto-positron', zoom=2,
title='K-Means Clustering')
fig.show()
C:\ProgramData\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning C:\ProgramData\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
scaler = StandardScaler()
coordinates = merged_df[['latitude', 'longitude']]
scaled_coordinates = scaler.fit_transform(coordinates)
dbscan = DBSCAN(eps=1, min_samples=2)
dbscan.fit(scaled_coordinates)
merged_df['dbscan_cluster'] = dbscan.labels_
agg = AgglomerativeClustering(n_clusters=3)
agg.fit(scaled_coordinates)
merged_df['agg_cluster'] = agg.labels_
fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', color='dbscan_cluster',
hover_name='addr', mapbox_style='carto-positron', zoom=2,
title='DBSCAN Clustering')
fig.show()
fig = px.scatter_mapbox(merged_df, lat='latitude', lon='longitude', color='agg_cluster',
hover_name='addr', mapbox_style='carto-positron', zoom=2,
title='Agglomerative Clustering')
fig.show()
scaler = StandardScaler()
coordinates = merged_df[['latitude', 'longitude']]
scaled_coordinates = scaler.fit_transform(coordinates)
agg = AgglomerativeClustering(n_clusters=3)
agg.fit(scaled_coordinates)
merged_df['agg_cluster'] = agg.labels_
def plot_box(df, feature, cluster_col):
fig = px.box(merged_df, x=cluster_col, y=feature, title=f'{feature.capitalize()} Distribution by Cluster')
fig.show()
plot_box(merged_df, 'temperature', 'agg_cluster')
plot_box(merged_df, 'speed', 'agg_cluster')
cluster_summary = merged_df.groupby('agg_cluster').agg({
'temperature': ['mean', 'median', 'std'],
'speed': ['mean', 'median', 'std']
})
print(cluster_summary)
temperature speed
mean median std mean median std
agg_cluster
0 24.867521 25.0 3.134258 150.021368 150.0 28.719368
1 24.830935 25.0 3.113412 149.467626 149.0 29.771404
2 25.160000 25.0 3.197395 158.200000 155.0 30.575044
def iqr_outliers(merged_df, column):
Q1 = merged_df[column].quantile(0.25)
Q3 = merged_df[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = merged_df[(merged_df[column] < lower_bound) | (merged_df[column] > upper_bound)]
return outliers
print("IQR Outliers:")
print("Temperature:", iqr_outliers(merged_df, 'temperature'))
print("Wind:", iqr_outliers(merged_df, 'wind'))
print("Speed:", iqr_outliers(merged_df, 'speed'))
IQR Outliers: Temperature: Empty DataFrame Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster] Index: [] Wind: Empty DataFrame Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster] Index: [] Speed: Empty DataFrame Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster] Index: []
def z_score_outliers(df, column, threshold=2):
z_scores = np.abs(stats.zscore(df[column]))
outliers = df[z_scores > threshold]
return outliers
print("\nZ-score Outliers:")
print("Temperature:", z_score_outliers(merged_df, 'temperature'))
print("Wind:", z_score_outliers(merged_df, 'wind'))
print("Speed:", z_score_outliers(merged_df, 'speed'))
Z-score Outliers: Temperature: Empty DataFrame Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster] Index: [] Wind: Empty DataFrame Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster] Index: [] Speed: Empty DataFrame Columns: [sender, receiver, date, temperature, wind, speed, port, addr, peers, honest, month, year, day_of_week, sender_encoded, latitude, longitude, cluster, dbscan_cluster, agg_cluster] Index: []
correlation_matrix = merged_df.corr()
correlation_matrix
C:\Users\sgk\AppData\Local\Temp\ipykernel_14512\3842175115.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
| temperature | wind | speed | port | month | year | day_of_week | sender_encoded | latitude | longitude | cluster | dbscan_cluster | agg_cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| temperature | 1.000000 | 0.089504 | 0.040735 | -0.013230 | -0.040469 | 0.054897 | -0.011577 | 0.060711 | 0.019469 | -0.016675 | 0.030297 | 0.011943 | 0.006436 |
| wind | 0.089504 | 1.000000 | 0.091357 | 0.015203 | 0.028749 | -0.056781 | 0.028394 | -0.043638 | 0.042026 | 0.045152 | 0.042482 | -0.027122 | -0.064909 |
| speed | 0.040735 | 0.091357 | 1.000000 | 0.084844 | -0.004437 | 0.015983 | -0.067476 | 0.065391 | -0.039479 | 0.021823 | -0.000241 | -0.054323 | 0.024742 |
| port | -0.013230 | 0.015203 | 0.084844 | 1.000000 | 0.003070 | 0.001866 | 0.000703 | 0.044636 | -0.099132 | -0.018504 | -0.016333 | -0.023332 | 0.078715 |
| month | -0.040469 | 0.028749 | -0.004437 | 0.003070 | 1.000000 | -0.965079 | -0.072037 | -0.001030 | -0.054375 | -0.029413 | -0.040569 | -0.014832 | 0.064478 |
| year | 0.054897 | -0.056781 | 0.015983 | 0.001866 | -0.965079 | 1.000000 | 0.061758 | 0.004424 | 0.031919 | 0.029090 | 0.024055 | 0.021042 | -0.048618 |
| day_of_week | -0.011577 | 0.028394 | -0.067476 | 0.000703 | -0.072037 | 0.061758 | 1.000000 | 0.023619 | 0.022731 | -0.024866 | -0.012000 | -0.064942 | -0.000542 |
| sender_encoded | 0.060711 | -0.043638 | 0.065391 | 0.044636 | -0.001030 | 0.004424 | 0.023619 | 1.000000 | -0.021767 | -0.041993 | 0.027412 | -0.070786 | 0.066477 |
| latitude | 0.019469 | 0.042026 | -0.039479 | -0.099132 | -0.054375 | 0.031919 | 0.022731 | -0.021767 | 1.000000 | -0.035060 | 0.241434 | 0.170454 | -0.522622 |
| longitude | -0.016675 | 0.045152 | 0.021823 | -0.018504 | -0.029413 | 0.029090 | -0.024866 | -0.041993 | -0.035060 | 1.000000 | 0.651369 | -0.082993 | -0.753771 |
| cluster | 0.030297 | 0.042482 | -0.000241 | -0.016333 | -0.040569 | 0.024055 | -0.012000 | 0.027412 | 0.241434 | 0.651369 | 1.000000 | -0.015637 | -0.766240 |
| dbscan_cluster | 0.011943 | -0.027122 | -0.054323 | -0.023332 | -0.014832 | 0.021042 | -0.064942 | -0.070786 | 0.170454 | -0.082993 | -0.015637 | 1.000000 | -0.104322 |
| agg_cluster | 0.006436 | -0.064909 | 0.024742 | 0.078715 | 0.064478 | -0.048618 | -0.000542 | 0.066477 | -0.522622 | -0.753771 | -0.766240 | -0.104322 | 1.000000 |
honest_df = merged_df[merged_df['honest'] == 'true']
dishonest_df = merged_df[merged_df['honest'] == 'false']
#ttest_ind is a statistical method that is used to compare the means of two independent groups of data.
t_stat, t_pvalue = ttest_ind(honest_df['speed'], dishonest_df['speed'])
print("t-test:")
print(" t-statistic:", t_stat)
print(" p-value:", t_pvalue)
t-test: t-statistic: 0.28016801917050665 p-value: 0.7794568968744121
#mannwhitneyu is a statistical method that is used to compare the distributions of two independent samples of data.
mw_stat, mw_pvalue = mannwhitneyu(honest_df['speed'], dishonest_df['speed'])
print("\nMann-Whitney U test:")
print(" U-statistic:", mw_stat)
print(" p-value:", mw_pvalue)
Mann-Whitney U test: U-statistic: 8539.5 p-value: 0.7962009507861991
t_stat, t_pvalue = ttest_ind(honest_df['temperature'], dishonest_df['temperature'])
print("t-test:")
print(" t-statistic:", t_stat)
print(" p-value:", t_pvalue)
t-test: t-statistic: -1.5302179746204665 p-value: 0.12655382391911785
contingency_table = pd.crosstab(merged_df['sender'], merged_df['honest'])
print("Contingency Table:")
contingency_table
Contingency Table:
| honest | false | true |
|---|---|---|
| sender | ||
| 0385da65-9ab0-46ac-a6d2-462041bf1fe1 | 0 | 2 |
| 040a8064-3026-4249-82b8-75c8a1aae388 | 1 | 0 |
| 06d4ce0d-44bc-430d-807f-c79765424e6b | 0 | 1 |
| 072d5042-68a7-489d-972b-b9e3074eaf2a | 0 | 2 |
| 07432308-5a09-4e9a-9d1a-6314c59d741f | 0 | 3 |
| ... | ... | ... |
| f90b1084-4a0f-47e6-9567-345700e794d5 | 0 | 2 |
| fab75436-bea2-4040-9692-cdc8d0ffbfc7 | 0 | 1 |
| fb2839d3-a266-46e6-a048-823b6706b1f6 | 0 | 2 |
| fde2d368-f04d-4072-b04e-2980b0583999 | 0 | 2 |
| fee60cd9-ff19-4f84-a156-03d99a91006e | 0 | 1 |
339 rows × 2 columns
chi2_stat, chi2_pvalue, chi2_dof, chi2_ex = chi2_contingency(contingency_table)
print("\nChi-squared Test:")
print(" chi2-statistic:", chi2_stat)
print(" p-value:", chi2_pvalue)
print(" degrees of freedom:", chi2_dof)
Chi-squared Test: chi2-statistic: 537.0 p-value: 2.9653844335890057e-11 degrees of freedom: 338
fullfalse=contingency_table[contingency_table['false'] !=0]
fullfalse
| honest | false | true |
|---|---|---|
| sender | ||
| 040a8064-3026-4249-82b8-75c8a1aae388 | 1 | 0 |
| 16555781-840a-42dd-9636-b2c849251f1b | 1 | 0 |
| 22cd4c98-9162-4798-bd6d-cc013da972a5 | 1 | 0 |
| 23a6eb58-1ee3-4bc3-ad7e-a9e1d7d0f571 | 1 | 0 |
| 2becdd30-78f0-433e-bb6e-17a798c721a9 | 1 | 0 |
| 2cb28562-bc2e-457f-ac18-cf5c0662dd7e | 1 | 0 |
| 31305180-08c8-44a7-b15d-c2980bdec662 | 2 | 0 |
| 467de89c-6638-4458-8fa9-e701afd83120 | 1 | 0 |
| 4f572263-4be2-436a-8e48-d9aa044de09d | 1 | 0 |
| 54e07a5f-8fe0-45eb-90e0-e8762c21fe2c | 1 | 0 |
| 66c4766a-1630-4b5a-b859-f09e97b4e616 | 1 | 0 |
| 6c192683-b510-47f4-b57f-28338454b3ea | 1 | 0 |
| 7571ba27-71fa-4368-b4f5-5ce1223f95a5 | 2 | 0 |
| 7ecb595b-74cd-40e3-a73a-72c204d6439a | 2 | 0 |
| 852f464a-b23c-4e09-a2b2-f41ed8997d4d | 2 | 0 |
| 9403b508-516f-4ae1-80d2-e707862d0124 | 1 | 0 |
| a1926bf3-7aa3-4466-990e-68322aa7a029 | 1 | 0 |
| a34dffb9-03af-4b51-ab62-d7ba0ff954a2 | 1 | 0 |
| c9ea49da-9bc3-450a-9959-619991d58f4a | 2 | 0 |
| ca8818bb-f56e-4b3b-8c96-9f6760134498 | 2 | 0 |
| d9777e82-84c0-4a13-881a-73264ade6b88 | 2 | 0 |
| da658947-0a2a-4b29-ad69-5792f0477a91 | 1 | 0 |
| dc64a949-18ba-4329-99e9-9d65b6de8f35 | 1 | 0 |
| e8a24155-20f3-44e6-ac5b-acdca7361c1c | 2 | 0 |
| e8fd134e-25f0-40ad-b60e-934bb334e7f8 | 1 | 0 |
falsecontent=merged_df[merged_df['honest']=='false']
falsecontent.reset_index(drop=True)
| sender | receiver | date | temperature | wind | speed | port | addr | peers | honest | month | year | day_of_week | sender_encoded | latitude | longitude | cluster | dbscan_cluster | agg_cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ca8818bb-f56e-4b3b-8c96-9f6760134498 | 41e0ab50-b719-406e-9ceb-ef02e6446633 | 2022-12-20 06:12:03 | 26 | 4 | 106 | 28292 | 11.253.5.52 | [163.179.28.48, 169.104.39.230, 137.218.170.21... | false | 12 | 2022 | 1 | 306 | 37.7510 | -97.8220 | 0 | 0 | 1 |
| 1 | ca8818bb-f56e-4b3b-8c96-9f6760134498 | ddb3af13-5834-48d3-87b5-eab45b909df2 | 2022-09-07 18:06:55 | 24 | 1 | 131 | 28292 | 11.253.5.52 | [163.179.28.48, 169.104.39.230, 137.218.170.21... | false | 9 | 2022 | 2 | 306 | 37.7510 | -97.8220 | 0 | 0 | 1 |
| 2 | 54e07a5f-8fe0-45eb-90e0-e8762c21fe2c | 7dd00fd4-7ca5-4098-80d0-ae6ef8f17d45 | 2022-11-05 07:39:39 | 30 | 16 | 106 | 20633 | 131.227.31.206 | [166.143.47.13, 55.195.18.250, 184.193.5.40, 1... | false | 11 | 2022 | 5 | 137 | 51.2167 | -0.5667 | 2 | 0 | 0 |
| 3 | 31305180-08c8-44a7-b15d-c2980bdec662 | 2ca1f8d3-b7bc-4e71-91b7-98c0ed150783 | 2022-12-23 09:55:47 | 29 | 2 | 119 | 36164 | 181.111.234.129 | [19.98.38.90, 20.189.192.124, 165.185.167.47, ... | false | 12 | 2022 | 4 | 77 | -34.6033 | -58.3817 | 0 | 0 | 2 |
| 4 | 31305180-08c8-44a7-b15d-c2980bdec662 | 0998bce7-f2a6-4209-8f27-372d5dad026c | 2022-12-28 09:32:51 | 28 | 13 | 118 | 36164 | 181.111.234.129 | [19.98.38.90, 20.189.192.124, 165.185.167.47, ... | false | 12 | 2022 | 2 | 77 | -34.6033 | -58.3817 | 0 | 0 | 2 |
| 5 | 467de89c-6638-4458-8fa9-e701afd83120 | ea7a0491-f376-405c-b783-640ed4d22881 | 2023-01-07 10:49:48 | 22 | 7 | 195 | 33442 | 218.12.116.115 | [65.126.189.120, 227.253.255.235, 163.179.28.4... | false | 1 | 2023 | 5 | 116 | 39.8897 | 115.2750 | 1 | 0 | 0 |
| 6 | c9ea49da-9bc3-450a-9959-619991d58f4a | 4cd58a99-ac8e-42ea-bfe6-bcd3081b12d9 | 2022-09-28 04:21:58 | 21 | 2 | 132 | 26354 | 182.213.244.139 | [237.115.30.245, 152.16.93.56, 95.133.14.18, 1... | false | 9 | 2022 | 2 | 305 | 37.5112 | 126.9741 | 1 | 0 | 0 |
| 7 | c9ea49da-9bc3-450a-9959-619991d58f4a | c5ce06d0-4d16-4357-ad5d-40cbfa28759e | 2022-11-22 04:22:01 | 21 | 6 | 105 | 26354 | 182.213.244.139 | [237.115.30.245, 152.16.93.56, 95.133.14.18, 1... | false | 11 | 2022 | 1 | 305 | 37.5112 | 126.9741 | 1 | 0 | 0 |
| 8 | 7ecb595b-74cd-40e3-a73a-72c204d6439a | ee013164-f367-44aa-9ee4-af0e93a7266c | 2023-01-17 01:11:01 | 26 | 7 | 135 | 29864 | 67.219.82.92 | [241.68.182.221, 225.87.130.255, 36.238.70.103... | false | 1 | 2023 | 1 | 190 | 39.6347 | -87.2639 | 0 | 0 | 1 |
| 9 | 7ecb595b-74cd-40e3-a73a-72c204d6439a | 234324db-3aec-4b5a-b2bc-9bd5eafaacdb | 2022-10-10 04:55:49 | 30 | 3 | 200 | 29864 | 67.219.82.92 | [241.68.182.221, 225.87.130.255, 36.238.70.103... | false | 10 | 2022 | 0 | 190 | 39.6347 | -87.2639 | 0 | 0 | 1 |
| 10 | a1926bf3-7aa3-4466-990e-68322aa7a029 | db2c61c2-d2d9-4d6b-9a15-4040e65a3eaf | 2022-10-22 22:25:14 | 27 | 0 | 179 | 34788 | 157.31.252.164 | [152.16.93.56, 61.124.213.100, 24.106.152.13, ... | false | 10 | 2022 | 5 | 241 | 33.4499 | -112.0712 | 0 | 0 | 1 |
| 11 | da658947-0a2a-4b29-ad69-5792f0477a91 | b4c691de-b4d4-4a70-8ea1-60f7d1eeaf85 | 2022-10-03 17:49:59 | 25 | 9 | 191 | 30640 | 149.104.28.112 | [212.208.191.97, 132.254.204.81, 201.177.102.2... | false | 10 | 2022 | 0 | 324 | 37.7510 | -97.8220 | 0 | 0 | 1 |
| 12 | e8a24155-20f3-44e6-ac5b-acdca7361c1c | b58f8c37-5bac-47f8-b653-52914f3ba2dd | 2022-11-02 21:03:00 | 25 | 8 | 147 | 31343 | 212.208.191.97 | [56.48.105.145, 26.231.127.224, 26.245.153.56,... | false | 11 | 2022 | 2 | 342 | 48.7649 | 1.9292 | 2 | 0 | 0 |
| 13 | e8a24155-20f3-44e6-ac5b-acdca7361c1c | c7ae9dd6-a1a0-49f2-a344-3b4eff4569d4 | 2022-11-07 09:16:10 | 28 | 6 | 189 | 31343 | 212.208.191.97 | [56.48.105.145, 26.231.127.224, 26.245.153.56,... | false | 11 | 2022 | 0 | 342 | 48.7649 | 1.9292 | 2 | 0 | 0 |
| 14 | 2becdd30-78f0-433e-bb6e-17a798c721a9 | a1926bf3-7aa3-4466-990e-68322aa7a029 | 2023-01-27 11:46:32 | 21 | 10 | 161 | 28575 | 43.103.237.170 | [65.90.216.7, 23.155.172.158, 211.209.228.35, ... | false | 1 | 2023 | 4 | 67 | 35.6850 | 139.7514 | 1 | 0 | 0 |
| 15 | 4f572263-4be2-436a-8e48-d9aa044de09d | 26d1d8b0-39ca-4e9c-9c39-0776bf63b370 | 2022-12-21 20:15:16 | 21 | 2 | 108 | 28150 | 173.217.108.104 | [234.116.2.6, 27.56.41.235, 73.194.2.180, 86.7... | false | 12 | 2022 | 2 | 130 | 31.6835 | -93.1197 | 0 | 0 | 1 |
| 16 | 2cb28562-bc2e-457f-ac18-cf5c0662dd7e | 6fdbb56b-a0bd-4afc-aada-3dff045341f7 | 2022-09-04 13:17:07 | 29 | 2 | 144 | 35973 | 178.18.13.198 | [119.15.105.40, 73.194.2.180, 153.127.186.137,... | false | 9 | 2022 | 6 | 70 | 53.5303 | 49.3461 | 2 | 0 | 0 |
| 17 | a34dffb9-03af-4b51-ab62-d7ba0ff954a2 | 3223c80f-ee5e-433d-a4ec-da0593588f36 | 2022-09-21 00:56:57 | 26 | 14 | 157 | 24713 | 61.225.107.205 | [68.246.74.148, 50.151.200.203, 37.238.110.20,... | false | 9 | 2022 | 2 | 245 | 23.5000 | 121.0000 | 1 | 0 | 0 |
| 18 | 7571ba27-71fa-4368-b4f5-5ce1223f95a5 | c9ea49da-9bc3-450a-9959-619991d58f4a | 2022-12-16 04:53:54 | 29 | 8 | 128 | 34410 | 150.209.40.178 | [182.213.244.139, 178.18.13.198, 98.170.14.91,... | false | 12 | 2022 | 4 | 179 | 43.0361 | -75.3817 | 0 | 0 | 1 |
| 19 | 7571ba27-71fa-4368-b4f5-5ce1223f95a5 | 0ed4ea90-7224-4654-8c53-d7084ab7718d | 2022-10-27 04:41:10 | 22 | 15 | 177 | 34410 | 150.209.40.178 | [182.213.244.139, 178.18.13.198, 98.170.14.91,... | false | 10 | 2022 | 3 | 179 | 43.0361 | -75.3817 | 0 | 0 | 1 |
| 20 | 16555781-840a-42dd-9636-b2c849251f1b | 3f59a5f8-b903-48f2-9691-907b3046ecb1 | 2022-09-22 09:19:23 | 28 | 7 | 104 | 24089 | 76.99.145.77 | [33.6.151.178, 71.190.70.177, 93.26.72.207, 14... | false | 9 | 2022 | 3 | 33 | 40.2933 | -75.7441 | 0 | 0 | 1 |
| 21 | d9777e82-84c0-4a13-881a-73264ade6b88 | 7bcdf7b7-35dd-49a1-be6c-4e8aaf4a8811 | 2023-01-08 07:42:23 | 28 | 10 | 170 | 36417 | 23.253.209.155 | [35.251.80.187, 6.231.245.255, 220.214.217.101... | false | 1 | 2023 | 6 | 322 | 29.4889 | -98.3987 | 0 | 0 | 1 |
| 22 | d9777e82-84c0-4a13-881a-73264ade6b88 | 218b4f1d-8977-4a31-8ff9-c68b5db7cac0 | 2022-09-01 01:22:40 | 24 | 2 | 171 | 36417 | 23.253.209.155 | [35.251.80.187, 6.231.245.255, 220.214.217.101... | false | 9 | 2022 | 3 | 322 | 29.4889 | -98.3987 | 0 | 0 | 1 |
| 23 | 22cd4c98-9162-4798-bd6d-cc013da972a5 | d2d64234-f05d-42f4-bf23-392c538da4e7 | 2023-01-10 21:06:40 | 23 | 12 | 152 | 28294 | 90.90.66.105 | [4.130.37.44, 186.32.76.3, 153.119.59.158, 139... | false | 1 | 2023 | 1 | 49 | 48.9833 | 2.2333 | 2 | 0 | 0 |
| 24 | 040a8064-3026-4249-82b8-75c8a1aae388 | aad31313-444b-4477-9df1-7806b0ae31af | 2022-10-20 04:57:32 | 28 | 12 | 168 | 31852 | 90.36.7.56 | [5.137.192.15, 67.202.183.135, 37.13.63.149, 2... | false | 10 | 2022 | 3 | 1 | 16.2333 | -61.3667 | 0 | 0 | 1 |
| 25 | 852f464a-b23c-4e09-a2b2-f41ed8997d4d | 6ea944b0-028a-4532-97b8-ce9ce1225fb2 | 2022-12-05 13:12:25 | 24 | 3 | 181 | 23050 | 24.223.71.151 | [216.139.145.182, 67.35.12.255, 184.182.38.122... | false | 12 | 2022 | 0 | 198 | 41.7101 | -103.6648 | 0 | 0 | 1 |
| 26 | 852f464a-b23c-4e09-a2b2-f41ed8997d4d | b56d6a9c-7454-47c0-ad98-2e56a0b9d012 | 2022-12-30 12:49:50 | 26 | 10 | 170 | 23050 | 24.223.71.151 | [216.139.145.182, 67.35.12.255, 184.182.38.122... | false | 12 | 2022 | 4 | 198 | 41.7101 | -103.6648 | 0 | 0 | 1 |
| 27 | dc64a949-18ba-4329-99e9-9d65b6de8f35 | aba972fb-cc42-49e8-a5e0-546bf2de79d4 | 2022-10-05 05:11:54 | 30 | 7 | 199 | 29942 | 112.76.206.99 | [24.223.71.151, 21.117.23.46, 87.231.93.255, 2... | false | 10 | 2022 | 2 | 327 | 37.5112 | 126.9741 | 1 | 0 | 0 |
| 28 | e8fd134e-25f0-40ad-b60e-934bb334e7f8 | dd9a8706-3f47-4a37-b864-74c84ce1e4c7 | 2022-09-04 19:42:19 | 26 | 12 | 101 | 25818 | 110.75.115.42 | [245.37.200.78, 105.159.233.225, 149.21.126.20... | false | 9 | 2022 | 6 | 343 | 30.2936 | 120.1614 | 1 | 0 | 0 |
| 29 | 9403b508-516f-4ae1-80d2-e707862d0124 | 072d5042-68a7-489d-972b-b9e3074eaf2a | 2022-09-29 16:34:30 | 26 | 14 | 156 | 33940 | 86.120.23.203 | [132.38.19.227, 150.126.208.240, 217.101.36.18... | false | 9 | 2022 | 3 | 225 | 45.6333 | 25.5833 | 2 | 0 | 0 |
| 30 | 6c192683-b510-47f4-b57f-28338454b3ea | 86b1dd0b-ddef-4968-88f7-bda1aaeebfb6 | 2022-10-14 16:22:05 | 20 | 1 | 185 | 35158 | 152.190.100.232 | [87.91.229.167, 216.37.69.51, 235.226.18.8, 20... | false | 10 | 2022 | 4 | 163 | 39.0180 | -77.5390 | 0 | 0 | 1 |
| 31 | 23a6eb58-1ee3-4bc3-ad7e-a9e1d7d0f571 | c7dc8e68-ccc1-479f-9d6e-076d8d9d6fbe | 2023-01-14 02:02:36 | 24 | 1 | 120 | 39825 | 176.40.17.118 | [147.242.90.109, 180.231.161.196, 22.245.208.1... | false | 1 | 2023 | 5 | 52 | 39.9264 | 32.8888 | 2 | 0 | 0 |
| 32 | 66c4766a-1630-4b5a-b859-f09e97b4e616 | 66a38013-24fd-4806-8205-bd0a5bb96093 | 2023-01-11 23:55:31 | 30 | 3 | 103 | 37407 | 17.157.180.94 | [33.139.142.206, 87.196.152.101, 176.40.17.118... | false | 1 | 2023 | 2 | 157 | 37.3042 | -122.0946 | 0 | 0 | 1 |
# Initialize the GeoLite2 reader
reader = geolite2.reader()
# Close the GeoLite2 reader
geolite2.close()
# Create a map centered at the mean latitude and longitude values
map_center = [falsecontent['latitude'].mean(), falsecontent['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=2)
# Add markers for each IP address
for index, row in falsecontent.iterrows():
if pd.notna(row['latitude']) and pd.notna(row['longitude']):
folium.Marker([row['latitude'], row['longitude']], tooltip=row['addr']).add_to(m)
# Display the map
m